In this R Notebook I implement a Convolutional Neural Network (CNN) for handwritten digit recognition on the MNIST database, using the MXNet framework for R.

Setup

You will need to install MXNet for R and, if you intend to use your GPU, the NVIDIA CUDA drivers.
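
A minimal sketch of one install route for the CPU build (the dmlc repository URL below was the documented one at the time of writing and may have moved since):

# install the CPU build of MXNet for R
# (assumption: the dmlc CRAN-style repository is still at this URL)
cran <- getOption("repos")
cran["dmlc"] <- "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/CRAN/"
options(repos = cran)
install.packages("mxnet")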

Download all four dataset files from the MNIST site and gunzip them into the project's data directory.
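
A sketch of that step in R, assuming the canonical MNIST URLs are still reachable and that the R.utils package is available for gunzip:

# fetch and decompress the four MNIST files into ./data
base  <- "http://yann.lecun.com/exdb/mnist/"
files <- c("train-images-idx3-ubyte.gz", "train-labels-idx1-ubyte.gz",
           "t10k-images-idx3-ubyte.gz",  "t10k-labels-idx1-ubyte.gz")
dir.create("data", showWarnings = FALSE)
for (f in files) {
  download.file(paste0(base, f), file.path("data", f), mode = "wb")
  R.utils::gunzip(file.path("data", f), overwrite = TRUE)
}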

Finally, load the libraries.

library(mxnet)    # neural network framework
library(magrittr) # pipe operators used to chain the model layers
library(caret)    # confusion matrix and performance metrics

Loading dataset

We’ll use an adaptation of a gist from Brendan O’Connor to read the files, transforming them into a structure that is simple to use and access.

# read function returns a list of datasets
load_mnist <- function() {
  load_image_file <- function(filename) {
    ret = list()
    f = file(filename,'rb')
    readBin(f,'integer',n=1,size=4,endian='big') # magic number (discarded)
    ret$n = readBin(f,'integer',n=1,size=4,endian='big') # number of images
    nrow = readBin(f,'integer',n=1,size=4,endian='big')  # rows per image
    ncol = readBin(f,'integer',n=1,size=4,endian='big')  # columns per image
    x = readBin(f,'integer',n=ret$n*nrow*ncol,size=1,signed=F)
    ret$x = matrix(x, ncol=nrow*ncol, byrow=T) # one image per row (784 pixels)
    close(f)
    ret
  }
  load_label_file <- function(filename) {
    f = file(filename,'rb')
    readBin(f,'integer',n=1,size=4,endian='big') # magic number (discarded)
    n = readBin(f,'integer',n=1,size=4,endian='big') # number of labels
    y = readBin(f,'integer',n=n,size=1,signed=F)
    close(f)
    y
  }
  train <- load_image_file('./data/train-images-idx3-ubyte')
  test <- load_image_file('./data/t10k-images-idx3-ubyte')
  
  train$y <- load_label_file('./data/train-labels-idx1-ubyte')
  test$y <- load_label_file('./data/t10k-labels-idx1-ubyte')  
  
  return(
    list(
      train = train,
      test = test
    )
  )
}

# plot one case: a 784-pixel vector drawn as a 28 x 28 image
show_digit <- function(arr784, col=gray(12:1/12), ...) {
  image(matrix(arr784, nrow=28)[,28:1], col=col, ...)
}

# load
mnist <- load_mnist()

Let’s check the first five cases of the dataset:

labels <- paste(mnist$train$y[1:5],collapse = ", ")
par(mfrow=c(1,5), mar=c(0.1,0.1,0.1,0.1))
for(i in 1:5) show_digit(mnist$train$x[i,])

Labels: 5, 0, 4, 1, 9

Convolutional Neural Network

LeNet

[Figure: LeNet CNN architecture]

Magrittr

# pipe-assign operator: stores the piped value in a global variable
# and passes it along, so we can keep a reference to each layer
# example:  rnorm(5,mean=5) %>% sqrt() %=>% "varname" %>% mean()

"%=>%" <- function(val,var) {
  assign(substitute(var),val, envir = .GlobalEnv)
  return(val)
}
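
A quick demonstration (the variable name roots is just illustrative): the intermediate value is stored in the global environment and still flows down the pipe.

# capture the square roots mid-pipe, then keep piping
rnorm(5, mean = 5) %>% sqrt() %=>% "roots" %>% mean()
roots  # the five square roots captured above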

Neural Network

# input data
lenet <- mx.symbol.Variable("data") %>%
  
  # Convolutional Layer Set 1 ( Conv > Tanh > Pool )
  mx.symbol.Convolution( kernel=c(5,5), num_filter=20, name="Conv1" )  %=>% "Conv1" %>%
  mx.symbol.Activation( act_type="tanh", name="Act1" )                 %=>% "Act1" %>%
  mx.symbol.Pooling( pool_type="max", kernel=c(2,2), 
                     stride=c(2,2), name = "Pool1")                    %=>% "Pool1" %>%
  
  # Convolutional Layer Set 2 ( Conv > Tanh > Pool )
  mx.symbol.Convolution( kernel=c(5,5), num_filter=50 , name="Conv2")  %=>% "Conv2" %>%
  mx.symbol.Activation( act_type="tanh", name="Act2" )                 %=>% "Act2" %>%
  mx.symbol.Pooling( pool_type="max", kernel=c(2,2),
                     stride=c(2,2), name = "Pool2")                    %=>% "Pool2" %>%
  
  # Flatten: 50 2D feature maps (4 x 4) -> one 1D array of 800 values
  mx.symbol.flatten( name="Flat")                                      %=>% "Flat1" %>%
  
  # Fully Connected Layer 1
  mx.symbol.FullyConnected( num_hidden=500, name="Full1" )             %=>% "Full1" %>%
  mx.symbol.Activation( act_type="tanh", name="Act3" )                 %=>% "Act3" %>%
  
  # Fully Connected Layer 2 (output)
  mx.symbol.FullyConnected( num_hidden=10 , name="Full2")              %=>% "Full2" %>%
  mx.symbol.SoftmaxOutput(name="SoftM")                                %=>% "SoftM" 
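
In shape terms: the 28 x 28 input becomes 20 feature maps of 24 x 24 after Conv1 (5 x 5 kernels, no padding), then 12 x 12 after Pool1; Conv2 produces 50 maps of 8 x 8, reduced to 4 x 4 by Pool2; flattening yields 800 values feeding the 500-unit and 10-unit fully connected layers. These sizes will show up again when we inspect the layers below.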

Checking the model built

graph.viz( lenet, direction = "LR" )

Training

# Reshape each dataset from N x 784 to 28 x 28 x 1 x N
# (width x height x channel x case), the layout MXNet expects

# train
tr.x <- t(mnist$train$x)
dim(tr.x) <- c(28,28,1,ncol(tr.x))

# test
ts.x <- t(mnist$test$x)
dim(ts.x) <- c(28,28,1,ncol(ts.x))
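
A quick sanity check on the new layout:

dim(tr.x)  # expect: 28 28 1 60000
dim(ts.x)  # expect: 28 28 1 10000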

# training
logger.epoc <- mx.callback.log.train.metric(100)
logger.batch <- mx.metric.logger$new()
mx.set.seed(42)  # the life, the universe and everything
ti <- proc.time()
model <- mx.model.FeedForward.create(lenet, 
                                     X=tr.x, 
                                     y=mnist$train$y,
                                     eval.data=list(
                                       data=ts.x, 
                                       label=mnist$test$y),
                                     ctx=mx.cpu(), 
                                     num.round=20, 
                                     array.batch.size=100,
                                     learning.rate=0.05, 
                                     momentum=0.9, 
                                     wd=0.00001,
                                     eval.metric=mx.metric.accuracy,
                                     epoch.end.callback=logger.epoc,
                                     batch.end.callback=mx.callback.log.train.metric(1, logger.batch))
te <- proc.time()
print(te-ti)
##    user  system elapsed 
## 3186.33 1775.99 1726.14
mx.model.save(model, "mnistModel",1)
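
The checkpoint can be restored later with mx.model.load, using the same prefix and iteration number:

# reload the saved checkpoint (prefix "mnistModel", iteration 1)
model <- mx.model.load("mnistModel", iteration = 1)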

Train evolution

plot(logger.batch$train, main = "Train Evolution", ylab="Accuracy", xlab="Batches")

Evaluation

Confusion Matrix

# run the test dataset through the trained model
outputs <- predict(model,ts.x)

# the output is a 10 x 10000 matrix
# transpose it into a tidy layout ( cases x classes )
t_outputs <- t(outputs)

# the last layer is a softmax, so each column holds the
# probability of a digit from 0 to 9
# let's pick the most probable digit for each test case
y_hat <- max.col(t_outputs)-1 # classes run 0-9, but R indexes from 1
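
As a quick cross-check before building the full confusion matrix:

mean(y_hat == mnist$test$y)  # fraction correct; should be close to 0.99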

cm <- confusionMatrix(factor(y_hat, levels=0:9), factor(mnist$test$y, levels=0:9))
cm$table
##           Reference
## Prediction    0    1    2    3    4    5    6    7    8    9
##          0  974    0    1    0    0    2    3    0    2    0
##          1    1 1131    0    0    0    0    2    2    0    0
##          2    0    0 1025    1    1    0    0    2    0    0
##          3    0    0    0 1000    0    5    1    1    1    2
##          4    0    0    1    0  970    0    0    1    0    6
##          5    1    0    0    6    0  878    4    0    2    2
##          6    3    2    1    0    1    2  947    0    0    1
##          7    1    0    2    0    0    0    0 1016    0    3
##          8    0    2    2    3    0    4    1    1  966    1
##          9    0    0    0    0   10    1    0    5    3  994

Overall Performance

cm$overall
##       Accuracy          Kappa  AccuracyLower  AccuracyUpper   AccuracyNull 
##      0.9901000      0.9889958      0.9879601      0.9919467      0.1135000 
## AccuracyPValue  McnemarPValue 
##      0.0000000            NaN

Visualizing the worst cases

Worst Case

# find the off-diagonal cell where the network fails most often
errors <- cm$table
diag(errors) <- 0   # ignore the correct predictions on the diagonal

worst <- which( errors==max(errors), arr.ind = T) - 1 # -1: indices -> digit labels

worst
##   Prediction Reference
## 9          9         4

Finding the mismatched cases

worst.idx <- mnist$test$y == worst[1,"Reference"] & y_hat == worst[1,"Prediction"]
worstcases <- mnist$test$x[worst.idx,]

par(mfrow=c(2,5), mar=c(0.1,0.1,0.1,0.1))
for(i in 1:10) show_digit(worstcases[i,])

Indeed, some of these cases are remarkably difficult to identify as a four, like the 2nd, 3rd, 6th, 9th and 10th.

wpred <- t_outputs[ worst.idx, ] # softmax outputs for the mismatched cases
par(mfrow=c(2,5), mar=c(0.1,0.1,0.1,0.1))
for(i in 1:10) barplot(wpred[i,])

Inspecting the network layers

Binding and Feed Forward

# use the layers' references to build a group symbol
# and create an executor that controls the network
out <- mx.symbol.Group(c(Conv1, Act1, Pool1, Conv2, Act2, Pool2, Flat1, Full1, Full2, SoftM))
executor <- mx.simple.bind(symbol = out,  data=dim(ts.x), ctx=mx.cpu())

# transfer the arguments and parameters learned
mx.exec.update.arg.arrays(executor, model$arg.params, match.name = T)
mx.exec.update.aux.arrays(executor, model$aux.params, match.name = T)

# prepare the input
mx.exec.update.arg.arrays(executor, list(data=mx.nd.array(ts.x)), match.name=TRUE)

# Feedforward: propagate the input through the network to the outputs
mx.exec.forward(executor, is.train=FALSE)

# list the output elements
names(executor$ref.outputs)
##  [1] "Conv1_output" "Act1_output"  "Pool1_output" "Conv2_output"
##  [5] "Act2_output"  "Pool2_output" "Flat_output"  "Full1_output"
##  [9] "Full2_output" "SoftM_output"
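
Each output keeps the width x height x filters x cases layout; for instance:

# Conv1 yields 20 feature maps of 24 x 24 for each of the 10000 test cases
dim(as.array(executor$ref.outputs$Conv1_output))  # expect: 24 24 20 10000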

Convolutional Layer One

Conv Filters

# choosing the first worst case
j <- which( worst.idx==T )[1]

# Conv1 outputs: one 24 x 24 feature map per filter
par(mfrow=c(4,5), mar=c(0.1,0.1,0.1,0.1))
for (i in 1:20) {
  outputData <- as.array(executor$ref.outputs$Conv1_output)[,,i,j]
  image(outputData[,24:1],
        xaxt='n', yaxt='n')
}

Activation

par(mfrow=c(4,5), mar=c(0.1,0.1,0.1,0.1))
for (i in 1:20) {
  outputData <- as.array(executor$ref.outputs$Act1_output)[,,i,j]
  image(outputData[,24:1],
        xaxt='n', yaxt='n')
}

Pooling

par(mfrow=c(4,5), mar=c(0.1,0.1,0.1,0.1))
for (i in 1:20) {
  outputData <- as.array(executor$ref.outputs$Pool1_output)[,,i,j]
  image(outputData[,12:1],
        xaxt='n', yaxt='n')
}

Convolutional Layer Two

Filters

par(mfrow=c(7,7), mar=c(0.1,0.1,0.1,0.1))
for (i in 1:49) {  # 49 of the 50 maps, to fit a 7 x 7 grid
  outputData <- as.array(executor$ref.outputs$Conv2_output)[,,i,j]
  image(outputData[,8:1],
        xaxt='n', yaxt='n')
}

Activations

par(mfrow=c(7,7), mar=c(0.1,0.1,0.1,0.1))
for (i in 1:49) {
  outputData <- as.array(executor$ref.outputs$Act2_output)[,,i,j]
  image(outputData[,8:1],
        xaxt='n', yaxt='n')
}

Pooling

par(mfrow=c(7,7), mar=c(0.1,0.1,0.1,0.1))
for (i in 1:49) {
  outputData <- as.array(executor$ref.outputs$Pool2_output)[,,i,j]
  image(outputData[,4:1],
        xaxt='n', yaxt='n')
}